rm(list = ls())
library(tidyverse, quietly = T, verbose = F)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(wesanderson, quietly = T, verbose = F)
library(patchwork, quietly = T, verbose = F)
library(tidysq, quietly = T, verbose = F) #read fasta file
##
## Attaching package: 'tidysq'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## The following object is masked from 'package:base':
##
## paste
library(readxl, quietly = T, verbose = F)
library(IRanges, quietly = T, verbose = F)
##
## Attaching package: 'BiocGenerics'
##
## The following object is masked from 'package:tidysq':
##
## paste
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, table,
## tapply, union, unique, unsplit, which.max, which.min
##
##
## Attaching package: 'S4Vectors'
##
## The following objects are masked from 'package:lubridate':
##
## second, second<-
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following object is masked from 'package:utils':
##
## findMatches
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
##
## Attaching package: 'IRanges'
##
## The following objects are masked from 'package:tidysq':
##
## collapse, reverse
##
## The following object is masked from 'package:lubridate':
##
## %within%
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:purrr':
##
## reduce
library(pheatmap)
library(ggplotify)
source("00_functions_2024.R")
enst_expr <- read_rds("../results/01_wrangled_data/01_enst_expr.rds")
ensg_expr <- read_rds("../results/01_wrangled_data/01_ensg_expr.rds")
load(file = "../data/gte.Rdata") # transcripts for each gene + genomic coordinates
pheno_nh <- read_rds("../results/01_wrangled_data/01_pheno_no_haematological_cancers.rds")
pheno <- read_rds("../data/pheno.rds")
hgnc_from_ensembl <- readRDS(file = "../results/001_hgnc_from_ensembl_biomart.rds")
list_of_categories_nh <- unique(pheno_nh$TCGA_GTEX_main_category)
list_of_categories <- unique(pheno$TCGA_GTEX_main_category)
# # find the gene in ensg_exp with the highest number of isoforms in order to
# # decide the dimensions of the boxplot
#
# l <- vector(mode = "numeric", length = nrow(ensg_expr))
# for(i in 1:nrow(ensg_expr)) {
# ensg <- rownames(ensg_expr)[i]
# l[i] <- length(names(gte_list[[ensg]]))
#
# maxl <- (max(l))
#
# }
For each target we want to evaluate the following:
# to view the categories, print list_of_categories
# import transcript data from ensembl website --> I want to keep only the protein coding transcripts
EGFR_transcripts <- read_xlsx("../data/isoforms/transcripts_EGFR.xlsx",
col_names = TRUE, skip = 1) # skip = 1 is necessary to have colnames
# select the tissues and cancer of interest and find isoforms expression in those tissues
EGFR_isoforms <- isoforms_selected_tissues(hgnc = "EGFR", TCGA_interest = c("glioblastoma multiforme", "lung squamous cell carcinoma", "lung adenocarcinoma", "breast invasive carcinoma", "stomach adenocarcinoma", "liver hepatocellular carcinoma"), GTEX_interest = c("heart", "kidney", "liver", "lung"), gte_list = gte_list, pheno = pheno_nh, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = EGFR_transcripts)
# plot the tcga and gtex of interest
lt <- EGFR_isoforms$gtex$enst_id %>% unique() %>% length()
palette <- wes_palette("FantasticFox1", lt, "continuous")
plot_tcga <- ggplot(EGFR_isoforms$tcga,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(EGFR_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
legend.text = element_text(size = 38),
legend.title = element_text(size = 50),
plot.margin = unit(c(1,1,1,10), "cm"),
legend.position = "bottom") +
ggtitle(label = "EGFR expression for TCGA cancers")
plot_gtex <- ggplot(EGFR_isoforms$gtex,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(EGFR_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
# legend.text = element_text(size = 30),
# legend.title = element_text(size = 35),
plot.margin = unit(c(1,6,1,4), "cm"),
legend.position = "none") +
ggtitle(label = paste("EGFR expression for GTEX normal tissues"))
(plot <- plot_tcga + plot_gtex)
ggsave(filename = "EGFR_isoforms_expression_selected_tissues_plot.pdf", plot = plot, device = "pdf",
path = "../results/plots/isoform_expression/",
width = 120, height = 55, units = "cm", limitsize = F)
deeptmhmm_output <- read.table("../results/Deeptmhmm/canonical_and_isoforms/EGFR", fill = T)
# import object (created using the msa script)
aligned_sequences <- read_fasta(paste("../results/aligned_sequences/EGFR_msa.txt"))
(EGFR_topology <- protein_properties(hgnc = "EGFR", hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = EGFR_transcripts, deeptmhmm_output = deeptmhmm_output, aligned_sequences = aligned_sequences))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 7 rows [1, 2, 3, 4, 5, 6,
## 7].
Import the table coming from DeepLoc2. Delete the variants that we should not be considering (we keep the same as before, that should be the protein coding). Make a heatmap with variant name on the rows and localization in the columns, colored by probability.
EGFR_deeploc <- read_csv(file = "../results/DeepLoc/EGFR_with_variants.csv")
## Rows: 38 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Protein_ID, Localizations, Signals
## dbl (10): Cytoplasm, Nucleus, Extracellular, Cell membrane, Mitochondrion, P...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# remove non-relevant columns
EGFR_deeploc <- EGFR_deeploc %>% dplyr::select(-Localizations, -Signals)
# filter out the variants that we don't need (based on what we used in the previous analyses)
keep <- EGFR_transcripts$`UniProt Match` %>% str_subset(pattern = "^-$", negate = TRUE)
EGFR_deeploc <- EGFR_deeploc %>% filter(str_detect(Protein_ID, paste(keep, collapse = "|"))) %>%
mutate(Protein_ID = str_extract(Protein_ID, "(?<=_)[^_]+_[^_]+")) %>%
separate(Protein_ID, into = c("Protein_ID", NA), sep = "_")
# make heatmap with DeepLoc2 probabilities
# add the ensemble IDs to the EGFR_deeploc dataframe
ensg <- EGFR_transcripts %>% dplyr::select(`Transcript ID`, `UniProt Match`) %>%
separate(., col = `Transcript ID`, into = c("transcript", NA), sep = "\\.")
EGFR_deeploc <- ensg %>% right_join(., EGFR_deeploc, by = c("UniProt Match" = "Protein_ID"))
EGFR_heatmap <- EGFR_deeploc %>% column_to_rownames("transcript") %>% dplyr::select(-`UniProt Match`)
loc_heatmap <- pheatmap(EGFR_heatmap,
cluster_cols = F, cluster_rows = F)
loc_heatmap <- as.ggplot(loc_heatmap)
ggsave(filename = "EGFR_deeploc2_heatmap.pdf", plot = loc_heatmap, device = "pdf",
path = "../results/plots/subcellular_localization/",
width = 20, height = 10, units = "cm", limitsize = T)
# to view the categories, print list_of_categories
# import transcript data from ensembl website --> I want to keep only the protein coding transcripts
CD7_transcripts <- read_xlsx("../data/isoforms/transcripts_CD7.xlsx",
col_names = TRUE, skip = 1) # skip = 1 is necessary to have colnames
# select the tissues and cancer of interest and find isoforms expression in those tissues
CD7_isoforms <- isoforms_selected_tissues(hgnc = "CD7", TCGA_interest = c("acute myeloid leukemia", "diffuse large b-cell lymphoma"), GTEX_interest = c("lung", "small intestine", "spleen", "stomach"), gte_list = gte_list, pheno = pheno, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = CD7_transcripts)
# (plot <- boxplot_isoforms_all_tissues(hgnc = "CD7", gte_list = gte_list, pheno = pheno, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl))
# plot the tcga and gtex of interest
lt <- CD7_isoforms$gtex$enst_id %>% unique() %>% length()
palette <- wes_palette("FantasticFox1", lt, "continuous")
plot_tcga <- ggplot(CD7_isoforms$tcga,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(CD7_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
legend.text = element_text(size = 30),
legend.title = element_text(size = 35),
plot.margin = unit(c(1,10,1,10), "cm"),
legend.position = "bottom") +
ggtitle(label = "CD7 expression for TCGA cancers")
plot_gtex <- ggplot(CD7_isoforms$gtex,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(CD7_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
# legend.text = element_text(size = 30),
# legend.title = element_text(size = 35),
plot.margin = unit(c(1,6,1,4), "cm"),
legend.position = "none") +
ggtitle(label = paste("CD7 expression for GTEX normal tissues"))
(plot <- plot_tcga + plot_gtex)
ggsave(filename = "CD7_isoforms_expression_selected_tissues_plot.pdf", plot = plot, device = "pdf",
path = "../results/plots/isoform_expression/",
width = 120, height = 55, units = "cm", limitsize = F)
deeptmhmm_output <- read.table("../results/Deeptmhmm/canonical_and_isoforms/CD7", fill = T)
# import object (created using the msa script)
aligned_sequences <- read_fasta(paste("../results/aligned_sequences/CD7_msa.txt"))
(CD7_topology <- protein_properties(hgnc = "CD7", hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = CD7_transcripts, deeptmhmm_output = deeptmhmm_output, aligned_sequences = aligned_sequences))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 4 rows [2, 3, 4,
## 5].
CD7_deeploc <- read_csv(file = "../results/DeepLoc/CD7_with_variants.csv")
## Rows: 5 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Protein_ID, Localizations, Signals
## dbl (10): Cytoplasm, Nucleus, Extracellular, Cell membrane, Mitochondrion, P...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# remove non-relevant columns
CD7_deeploc <- CD7_deeploc %>% dplyr::select(-Localizations, -Signals)
# filter out the variants that we don't need (based on what we used in the previous analyses)
keep <- CD7_transcripts$`UniProt Match` %>% str_subset(pattern = "^-$", negate = TRUE)
keep[1] <- "P09564" # there are two uniprot IDs corresponding to this variant. I am keeping the one that matches the CD7_deeploc entry.
CD7_deeploc <- CD7_deeploc %>% filter(str_detect(Protein_ID, paste(keep, collapse = "|"))) %>%
mutate(Protein_ID = str_extract(Protein_ID, "(?<=_)[^_]+_[^_]+")) %>%
separate(Protein_ID, into = c("Protein_ID", NA), sep = "_")
# make heatmap with DeepLoc2 probabilities
# add the ensemble IDs to the CD7_deeploc dataframe
ensg <- CD7_transcripts %>% dplyr::select(`Transcript ID`, `UniProt Match`) %>%
separate(., col = `Transcript ID`, into = c("transcript", NA), sep = "\\.")
ensg[1, 2] <- keep[1] # same as before. There are two uniprot IDs corresponding to this variant. I am keeping the one that matches the CD7_deeploc entry.
CD7_deeploc <- ensg %>% right_join(., CD7_deeploc, by = c("UniProt Match" = "Protein_ID"))
CD7_heatmap <- CD7_deeploc %>% column_to_rownames("transcript") %>% dplyr::select(-`UniProt Match`)
loc_heatmap <- pheatmap(CD7_heatmap,
cluster_cols = F, cluster_rows = F)
loc_heatmap <- as.ggplot(loc_heatmap)
ggsave(filename = "CD7_deeploc2_heatmap.pdf", plot = loc_heatmap, device = "pdf",
path = "../results/plots/subcellular_localization/",
width = 20, height = 10, units = "cm", limitsize = T)
# to view the categories, print list_of_categories
# import transcript data from ensembl website --> I want to keep only the protein coding transcripts
MSLN_transcripts <- read_xlsx("../data/isoforms/transcripts_MSLN.xlsx",
col_names = TRUE, skip = 1) # skip = 1 is necessary to have colnames
# select the tissues and cancer of interest and find isoforms expression in those tissues
MSLN_isoforms <- isoforms_selected_tissues(hgnc = "MSLN", TCGA_interest = c("lung squamous cell carcinoma", "mesothelioma", "colon adenocarcinoma", "rectum adenocarcinoma", "pancreatic adenocarcinoma", "ovarian serous cystadenocarcinoma", "stomach adenocarcinoma", "cholangiocarcinoma", "cervical & endocervical cancer", "uterine corpus endometrioid carcinoma", "breast invasive carcinoma"), GTEX_interest = c("lung", "kidney", "uterus", "fallopian tube", "heart"), gte_list = gte_list, pheno = pheno_nh, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = MSLN_transcripts)
# (plot <- boxplot_isoforms_all_tissues(hgnc = "MSLN", gte_list = gte_list, pheno = pheno_nh, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl))
# plot the tcga and gtex of interest
lt <- MSLN_isoforms$gtex$enst_id %>% unique() %>% length()
palette <- wes_palette("FantasticFox1", lt, "continuous")
plot_tcga <- ggplot(MSLN_isoforms$tcga,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(MSLN_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
legend.text = element_text(size = 30),
legend.title = element_text(size = 32),
plot.margin = unit(c(1,6,1,10), "cm"),
legend.position = "bottom") +
ggtitle(label = "MSLN expression for TCGA cancers")
plot_gtex <- ggplot(MSLN_isoforms$gtex,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(MSLN_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
# legend.text = element_text(size = 30),
# legend.title = element_text(size = 35),
plot.margin = unit(c(1,6,1,4), "cm"),
legend.position = "none") +
ggtitle(label = paste("MSLN expression for GTEX normal tissues"))
(plot <- plot_tcga + plot_gtex + plot_layout(widths = c(2, 1)))
ggsave(filename = "MSLN_isoforms_expression_selected_tissues_plot.pdf", plot = plot, device = "pdf",
path = "../results/plots/isoform_expression/",
width = 140, height = 55, units = "cm", limitsize = F)
deeptmhmm_output <- read.table("../results/Deeptmhmm/canonical_and_isoforms/MSLN", fill = T)
# import object (created using the msa script)
aligned_sequences <- read_fasta("../results/aligned_sequences/MSLN_msa.txt")
(MSLN_topology <- protein_properties(hgnc = "MSLN", hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = MSLN_transcripts, deeptmhmm_output = deeptmhmm_output, aligned_sequences = aligned_sequences))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 8 rows [1, 2, 3, 4, 5, 6,
## 7, 8].
MSLN_deeploc <- read_csv(file = "../results/DeepLoc/MSLN_with_variants.csv")
## Rows: 8 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Protein_ID, Localizations, Signals
## dbl (10): Cytoplasm, Nucleus, Extracellular, Cell membrane, Mitochondrion, P...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# remove non-relevant columns
MSLN_deeploc <- MSLN_deeploc %>% dplyr::select(-Localizations, -Signals)
# filter out the variants that we don't need (based on what we used in the previous analyses)
keep <- MSLN_transcripts$`UniProt Match` %>% str_subset(pattern = "^-$", negate = TRUE)
MSLN_deeploc <- MSLN_deeploc %>% filter(str_detect(Protein_ID, paste(keep, collapse = "|"))) %>%
mutate(Protein_ID = str_extract(Protein_ID, "(?<=_)[^_]+_[^_]+")) %>%
separate(Protein_ID, into = c("Protein_ID", NA), sep = "_")
# make heatmap with DeepLoc2 probabilities
# add the ensemble IDs to the MSLN_deeploc dataframe
ensg <- MSLN_transcripts %>% dplyr::select(`Transcript ID`, `UniProt Match`) %>%
separate(., col = `Transcript ID`, into = c("transcript", NA), sep = "\\.")
MSLN_deeploc <- ensg %>% right_join(., MSLN_deeploc, by = c("UniProt Match" = "Protein_ID")) %>%
drop_na()
MSLN_heatmap <- MSLN_deeploc %>% column_to_rownames("transcript") %>% dplyr::select(-`UniProt Match`)
loc_heatmap <- pheatmap(MSLN_heatmap,
cluster_cols = F, cluster_rows = F)
loc_heatmap <- as.ggplot(loc_heatmap)
ggsave(filename = "MSLN_deeploc2_heatmap.pdf", plot = loc_heatmap, device = "pdf",
path = "../results/plots/subcellular_localization/",
width = 20, height = 12, units = "cm", limitsize = T)
# to view the categories, print list_of_categories
# import transcript data from ensembl website --> I want to keep only the protein coding transcripts
CD19_transcripts <- read_xlsx("../data/isoforms/transcripts_CD19.xlsx",
col_names = TRUE, skip = 1) # skip = 1 is necessary to have colnames
# select the tissues and cancer of interest and find isoforms expression in those tissues
CD19_isoforms <- isoforms_selected_tissues(hgnc = "CD19", TCGA_interest = c("acute myeloid leukemia", "diffuse large b-cell lymphoma", "glioblastoma multiforme", "uveal melanoma", "pheochromocytoma & paraganglioma", "adrenocortical cancer", "thyroid carcinoma", "breast invasive carcinoma", "lung squamous cell carcinoma", "lung adenocarcinoma", "sarcoma", "pancreatic adenocarcinoma"), GTEX_interest = c("small intestine", "spleen", "stomach", "testis"), gte_list = gte_list, pheno = pheno_nh, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = CD19_transcripts)
# (plot <- boxplot_isoforms_all_tissues(hgnc = "CD19", gte_list = gte_list, pheno = pheno, enst_expr = enst_expr, hgnc_from_ensembl = hgnc_from_ensembl))
# plot the tcga and gtex of interest
lt <- CD19_isoforms$gtex$enst_id %>% unique() %>% length()
palette <- wes_palette("FantasticFox1", lt, "continuous")
plot_tcga <- ggplot(CD19_isoforms$tcga,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(CD19_isoforms$tcga$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
legend.text = element_text(size = 30),
legend.title = element_text(size = 32),
plot.margin = unit(c(1,6,1,10), "cm"),
legend.position = "bottom") +
ggtitle(label = "CD19 expression for TCGA cancers")
plot_gtex <- ggplot(CD19_isoforms$gtex,
mapping = aes(x = category, y = expression_level, fill = enst_id)) +
geom_boxplot(alpha = 0.9, outlier.shape = NA, width = 0.9) +
scale_fill_manual(values = palette) +
ylim(0, max(CD19_isoforms$gtex$expression_level)) +
theme_light() +
xlab(" ") +
ylab("Log2(TPM)") +
theme(axis.text.x = element_text(angle = 40, hjust = 1, size = 35),
axis.text.y = element_text(size = 30),
axis.title = element_text(size = 30),
plot.title = element_text(size = 50),
# legend.text = element_text(size = 30),
# legend.title = element_text(size = 35),
plot.margin = unit(c(1,6,1,4), "cm"),
legend.position = "none") +
ggtitle(label = paste("CD19 expression for GTEX normal tissues"))
(plot <- plot_tcga + plot_gtex + plot_layout(widths = c(2, 1)))
ggsave(filename = "CD19_isoforms_expression_selected_tissues_plot.pdf", plot = plot, device = "pdf",
path = "../results/plots/isoform_expression/",
width = 120, height = 55, units = "cm", limitsize = F)
deeptmhmm_output <- read.table("../results/Deeptmhmm/canonical_and_isoforms/CD19", fill = T)
# import object (created using the msa script)
aligned_sequences <- read_fasta("../results/aligned_sequences/CD19_msa.txt")
(CD19_topology <- protein_properties(hgnc = "CD19", hgnc_from_ensembl = hgnc_from_ensembl, ensembl_table = CD19_transcripts, deeptmhmm_output = deeptmhmm_output, aligned_sequences = aligned_sequences))
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 2 rows [1, 2].
CD19_deeploc <- read_csv(file = "../results/DeepLoc/CD19_with_variants.csv")
## Rows: 6 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Protein_ID, Localizations, Signals
## dbl (10): Cytoplasm, Nucleus, Extracellular, Cell membrane, Mitochondrion, P...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# remove non-relevant columns
CD19_deeploc <- CD19_deeploc %>% dplyr::select(-Localizations, -Signals)
# filter out the variants that we don't need (based on what we used in the previous analyses)
keep <- CD19_transcripts$`UniProt Match` %>% str_subset(pattern = "^-$", negate = TRUE)
CD19_deeploc <- CD19_deeploc %>% filter(str_detect(Protein_ID, paste(keep, collapse = "|"))) %>%
mutate(Protein_ID = str_extract(Protein_ID, "(?<=_)[^_]+_[^_]+")) %>%
separate(Protein_ID, into = c("Protein_ID", NA), sep = "_")
# make heatmap with DeepLoc2 probabilities
# add the ensemble IDs to the MSLN_deeploc dataframe
ensg <- CD19_transcripts %>% dplyr::select(`Transcript ID`, `UniProt Match`) %>%
separate(., col = `Transcript ID`, into = c("transcript", NA), sep = "\\.")
CD19_deeploc <- ensg %>% right_join(., CD19_deeploc, by = c("UniProt Match" = "Protein_ID"))
CD19_heatmap <- CD19_deeploc %>% column_to_rownames("transcript") %>% dplyr::select(-`UniProt Match`)
loc_heatmap <- pheatmap(CD19_heatmap,
cluster_cols = F, cluster_rows = F)
loc_heatmap <- as.ggplot(loc_heatmap)
ggsave(filename = "CD19_deeploc2_heatmap.pdf", plot = loc_heatmap, device = "pdf",
path = "../results/plots/subcellular_localization/",
width = 20, height = 7, units = "cm", limitsize = T)